# load in required libraries
pacman::p_load(
  here,        
  janitor,    
  lubridate,  
  tidyverse,
  skimr
)
# load in raw datasets
airlines <- read_csv(here("raw_data/airlines.csv"))
Rows: 12 Columns: 2
── Column specification ───────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): carrier, name

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
airports <- read_csv(here("raw_data/airports.csv"))
Rows: 1333 Columns: 8
── Column specification ───────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): faa, name, dst, tzone
dbl (4): lat, lon, alt, tz

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
flights <- read_csv(here("raw_data/flights.csv"))
Rows: 303748 Columns: 19
── Column specification ───────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (4): carrier, tailnum, origin, dest
dbl  (14): year, month, day, dep_time, sched_dep_time, dep_delay, arr_time, sched_arr_time, arr_delay, ...
dttm  (1): time_hour

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
planes <- read_csv(here("raw_data/planes.csv"))
Rows: 3521 Columns: 9
── Column specification ───────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (5): tailnum, type, manufacturer, model, engine
dbl (4): year, engines, seats, speed

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
weather <- read_csv(here("raw_data/weather.csv"))
Rows: 26201 Columns: 15
── Column specification ───────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (1): origin
dbl  (13): year, month, day, hour, temp, dewp, humid, wind_dir, wind_speed, wind_gust, precip, pressure...
dttm  (1): time_hour

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Initial Data Exploration

Use summary to perform an initial exploration of the datasets, noting where there are large numbers of NA values. Variable types will be addressed once we have a cleaned, combined dataset.

# explore the data
summary(airlines)
summary(airports)
summary(planes)
summary(weather)
summary(flights)

skim(flights)
skim(weather)

Initial Data Cleaning

There are a large number of missing values in the weather and flights dataset. The proportion of the dataset with missing values is calcualted below.

# calculate proportion of the data which are NAs
weather %>% 
  summarise_each(funs(100 * mean(is.na(.))))
Warning: `summarise_each_()` was deprecated in dplyr 0.7.0.
Please use `across()` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
Warning: `funs()` was deprecated in dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
flights %>% 
  summarise_each(funs(100 * mean(is.na(.))))

The proportion of NAs in the flights dataset is less than 3% and therefore of little concern, however as the NAs are in columns that I will be using in the analysis, I can drop these from the dataset.

# drop NAs from flight data since these observations won't be useful in the analysis
flights <- flights %>% 
  drop_na()

There are variables in the weather dataset which have a proportion of missing values greater than 90%, there is little value in keeping these variables in the data and therefore they can be dropped.

# drop variables where proportion of NAs is greater than 90%
weather <- weather %>% 
  select(-c(temp, dewp, humid, precip, pressure))

I have chosen to drop three columns from the airports dataset - I don’t foresee these variables being of much consequence to the analysis.

airports <- airports %>% 
  select(-c(tz, dst, tzone))

Joining the Datasets

The flights dataset can be joined to the airlines, planes and airports datasets (the join is repeated for the airport dataset so we have corresponding information for both the origin and destination airports)

# join flights dataset with airlines, planes and airport datasets
flights_joined <- flights %>% 
  left_join(airlines, by = "carrier") %>% 
  left_join(planes, by = "tailnum") %>% 
  left_join(airports, by = c("origin" = "faa")) %>% 
  left_join(airports, by = c("dest" = "faa"))
flights_joined
ewr_flights <- flights_joined %>% 
  filter(origin == "EWR")
jfk_flights <- flights_joined %>% 
  filter(origin == "JFK")
lga_flights <- flights_joined %>% 
  filter(origin == "LGA")
ewr_weather <- weather %>% 
  filter(origin == "EWR")
jfk_weather <- weather %>% 
  filter(origin == "JFK")
lga_weather <- weather %>% 
  filter(origin == "LGA")
ewr_flights_joined <- ewr_flights %>% 
  left_join(ewr_weather, by = "time_hour")
ewr_flights_joined
jfk_flights_joined <- jfk_flights %>% 
  left_join(jfk_weather, by = "time_hour")
jfk_flights_joined
lga_flights_joined <- lga_flights %>% 
  left_join(lga_weather, by = "time_hour")
lga_flights_joined
class(x)
[1] "times"
colnames
 [1] "orig_depart_date"  "orig_depart_time"  "sched_depart_time" "dep_delay"         "orig_arr_time"    
 [6] "sched_arr_time"    "arr_delay"         "carrier"           "flight_number"     "tail_number"      
[11] "origin.x"          "origin_name"       "dest"              "flight_time"       "distance"         
[16] "time_hour"         "name.x"            "year"              "type"              "manufacturer"     
[21] "model"             "seats"             "engine"            "name.y"            "lat"              
[26] "lon"               "alt"               "origin.y"          "wind_dir"          "wind_speed"       
[31] "wind_gust"         "visib"            
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CiMgbG9hZCBpbiByZXF1aXJlZCBsaWJyYXJpZXMKcGFjbWFuOjpwX2xvYWQoCiAgaGVyZSwgICAgICAgIAogIGphbml0b3IsICAgIAogIGx1YnJpZGF0ZSwgIAogIHRpZHl2ZXJzZSwKICBza2ltcgopCmBgYAoKYGBge3J9CiMgbG9hZCBpbiByYXcgZGF0YXNldHMKYWlybGluZXMgPC0gcmVhZF9jc3YoaGVyZSgicmF3X2RhdGEvYWlybGluZXMuY3N2IikpCmFpcnBvcnRzIDwtIHJlYWRfY3N2KGhlcmUoInJhd19kYXRhL2FpcnBvcnRzLmNzdiIpKQpmbGlnaHRzIDwtIHJlYWRfY3N2KGhlcmUoInJhd19kYXRhL2ZsaWdodHMuY3N2IikpCnBsYW5lcyA8LSByZWFkX2NzdihoZXJlKCJyYXdfZGF0YS9wbGFuZXMuY3N2IikpCndlYXRoZXIgPC0gcmVhZF9jc3YoaGVyZSgicmF3X2RhdGEvd2VhdGhlci5jc3YiKSkKYGBgCgojIEluaXRpYWwgRGF0YSBFeHBsb3JhdGlvbgoKVXNlIHN1bW1hcnkgdG8gcGVyZm9ybSBhbiBpbml0aWFsIGV4cGxvcmF0aW9uIG9mIHRoZSBkYXRhc2V0cywgbm90aW5nIHdoZXJlIHRoZXJlIGFyZSBsYXJnZSBudW1iZXJzIG9mIE5BIHZhbHVlcy4gVmFyaWFibGUgdHlwZXMgd2lsbCBiZSBhZGRyZXNzZWQgb25jZSB3ZSBoYXZlIGEgY2xlYW5lZCwgY29tYmluZWQgZGF0YXNldC4KCmBgYHtyfQojIGV4cGxvcmUgdGhlIGRhdGEKc3VtbWFyeShhaXJsaW5lcykKc3VtbWFyeShhaXJwb3J0cykKc3VtbWFyeShwbGFuZXMpCnN1bW1hcnkod2VhdGhlcikKc3VtbWFyeShmbGlnaHRzKQoKc2tpbShmbGlnaHRzKQpza2ltKHdlYXRoZXIpCmBgYAoKIyBJbml0aWFsIERhdGEgQ2xlYW5pbmcKClRoZXJlIGFyZSBhIGxhcmdlIG51bWJlciBvZiBtaXNzaW5nIHZhbHVlcyBpbiB0aGUgd2VhdGhlciBhbmQgZmxpZ2h0cyBkYXRhc2V0LiBUaGUgcHJvcG9ydGlvbiBvZiB0aGUgZGF0YXNldCB3aXRoIG1pc3NpbmcgdmFsdWVzIGlzIGNhbGN1YWx0ZWQgYmVsb3cuCgpgYGB7cn0KIyBjYWxjdWxhdGUgcHJvcG9ydGlvbiBvZiB0aGUgZGF0YSB3aGljaCBhcmUgTkFzCndlYXRoZXIgJT4lIAogIHN1bW1hcmlzZV9lYWNoKGZ1bnMoMTAwICogbWVhbihpcy5uYSguKSkpKQoKZmxpZ2h0cyAlPiUgCiAgc3VtbWFyaXNlX2VhY2goZnVucygxMDAgKiBtZWFuKGlzLm5hKC4pKSkpCmBgYAoKVGhlIHByb3BvcnRpb24gb2YgTkFzIGluIHRoZSBmbGlnaHRzIGRhdGFzZXQgaXMgbGVzcyB0aGFuIDMlIGFuZCB0aGVyZWZvcmUgb2YgbGl0dGxlIGNvbmNlcm4sIGhvd2V2ZXIgYXMgdGhlIE5BcyBhcmUgaW4gY29sdW1ucyB0aGF0IEkgd2lsbCBiZSB1c2luZyBpbiB0aGUgYW5hbHlzaXMsIEkgY2FuIGRyb3AgdGhlc2UgZnJvbSB0aGUgZGF0YXNldC4KCmBgYHtyfQojIGRyb3AgTkFzIGZyb20gZmxpZ2h0IGRhdGEgc2luY2UgdGhlc2Ugb2JzZXJ2YXRpb25zIHdvbid0IGJlIHVzZWZ1bCBpbiB0aGUgYW5hbHlzaXMKZmxpZ2h0cyA8LSBmbGlnaHRzICU+JSAKICBkcm9wX25hKCkKYGBgCgpUaGVyZSBhcmUgdmFyaWFibGVzIGluIHRoZSB3ZWF0aGVyIGRhdGFzZXQgd2hpY2ggaGF2ZSBhIHByb3BvcnRpb24gb2YgbWlzc2luZyB2YWx1ZXMgZ3JlYXRlciB0aGFuIDkwJSwgdGhlcmUgaXMgbGl0dGxlIHZhbHVlIGluIGtlZXBpbmcgdGhlc2UgdmFyaWFibGVzIGluIHRoZSBkYXRhIGFuZCB0aGVyZWZvcmUgdGhleSBjYW4gYmUgZHJvcHBlZC4KCmBgYHtyfQojIGRyb3AgdmFyaWFibGVzIHdoZXJlIHByb3BvcnRpb24gb2YgTkFzIGlzIGdyZWF0ZXIgdGhhbiA5MCUKd2VhdGhlciA8LSB3ZWF0aGVyICU+JSAKICBzZWxlY3QoLWModGVtcCwgZGV3cCwgaHVtaWQsIHByZWNpcCwgcHJlc3N1cmUpKQpgYGAKCkkgaGF2ZSBjaG9zZW4gdG8gZHJvcCB0aHJlZSBjb2x1bW5zIGZyb20gdGhlIGFpcnBvcnRzIGRhdGFzZXQgLSBJIGRvbid0IGZvcmVzZWUgdGhlc2UgdmFyaWFibGVzIGJlaW5nIG9mIG11Y2ggY29uc2VxdWVuY2UgdG8gdGhlIGFuYWx5c2lzLgoKYGBge3J9CmFpcnBvcnRzIDwtIGFpcnBvcnRzICU+JSAKICBzZWxlY3QoLWModHosIGRzdCwgdHpvbmUpKQpgYGAKCiMgSm9pbmluZyB0aGUgRGF0YXNldHMKClRoZSBmbGlnaHRzIGRhdGFzZXQgY2FuIGJlIGpvaW5lZCB0byB0aGUgYWlybGluZXMsIHBsYW5lcyBhbmQgYWlycG9ydHMgZGF0YXNldHMgKHRoZSBqb2luIGlzIHJlcGVhdGVkIGZvciB0aGUgYWlycG9ydCBkYXRhc2V0IHNvIHdlIGhhdmUgY29ycmVzcG9uZGluZyBpbmZvcm1hdGlvbiBmb3IgYm90aCB0aGUgb3JpZ2luIGFuZCBkZXN0aW5hdGlvbiBhaXJwb3J0cykKCmBgYHtyfQojIGpvaW4gZmxpZ2h0cyBkYXRhc2V0IHdpdGggYWlybGluZXMsIHBsYW5lcyBhbmQgYWlycG9ydCBkYXRhc2V0cwpmbGlnaHRzX2pvaW5lZCA8LSBmbGlnaHRzICU+JSAKICBsZWZ0X2pvaW4oYWlybGluZXMsIGJ5ID0gImNhcnJpZXIiKSAlPiUgCiAgbGVmdF9qb2luKHBsYW5lcywgYnkgPSAidGFpbG51bSIpICU+JSAKICBsZWZ0X2pvaW4oYWlycG9ydHMsIGJ5ID0gYygib3JpZ2luIiA9ICJmYWEiKSkgJT4lIAogIGxlZnRfam9pbihhaXJwb3J0cywgYnkgPSBjKCJkZXN0IiA9ICJmYWEiKSkKZmxpZ2h0c19qb2luZWQKYGBgCgpgYGB7cn0KZXdyX2ZsaWdodHMgPC0gZmxpZ2h0c19qb2luZWQgJT4lIAogIGZpbHRlcihvcmlnaW4gPT0gIkVXUiIpCmpma19mbGlnaHRzIDwtIGZsaWdodHNfam9pbmVkICU+JSAKICBmaWx0ZXIob3JpZ2luID09ICJKRksiKQpsZ2FfZmxpZ2h0cyA8LSBmbGlnaHRzX2pvaW5lZCAlPiUgCiAgZmlsdGVyKG9yaWdpbiA9PSAiTEdBIikKYGBgCgpgYGB7cn0KZXdyX3dlYXRoZXIgPC0gd2VhdGhlciAlPiUgCiAgZmlsdGVyKG9yaWdpbiA9PSAiRVdSIikKamZrX3dlYXRoZXIgPC0gd2VhdGhlciAlPiUgCiAgZmlsdGVyKG9yaWdpbiA9PSAiSkZLIikKbGdhX3dlYXRoZXIgPC0gd2VhdGhlciAlPiUgCiAgZmlsdGVyKG9yaWdpbiA9PSAiTEdBIikKYGBgCgpgYGB7cn0KZXdyX2ZsaWdodHNfam9pbmVkIDwtIGV3cl9mbGlnaHRzICU+JSAKICBsZWZ0X2pvaW4oZXdyX3dlYXRoZXIsIGJ5ID0gInRpbWVfaG91ciIpCmV3cl9mbGlnaHRzX2pvaW5lZApgYGAKCmBgYHtyfQpqZmtfZmxpZ2h0c19qb2luZWQgPC0gamZrX2ZsaWdodHMgJT4lIAogIGxlZnRfam9pbihqZmtfd2VhdGhlciwgYnkgPSAidGltZV9ob3VyIikKamZrX2ZsaWdodHNfam9pbmVkCmBgYAoKYGBge3J9CmxnYV9mbGlnaHRzX2pvaW5lZCA8LSBsZ2FfZmxpZ2h0cyAlPiUgCiAgbGVmdF9qb2luKGxnYV93ZWF0aGVyLCBieSA9ICJ0aW1lX2hvdXIiKQpsZ2FfZmxpZ2h0c19qb2luZWQKYGBgCgpgYGB7cn0KZmxpZ2h0c19qb2luZWQgPC0gYmluZF9yb3dzKGV3cl9mbGlnaHRzX2pvaW5lZCwgamZrX2ZsaWdodHNfam9pbmVkLCBsZ2FfZmxpZ2h0c19qb2luZWQpCmBgYAoKYGBge3J9CmEgPC0gZmxpZ2h0c19qb2luZWQgJT4lIAogIG11dGF0ZShhY3Jvc3MoLmNvbHMgPSBjKGRlcF90aW1lLCBzY2hlZF9kZXBfdGltZSwgYXJyX3RpbWUsIHNjaGVkX2Fycl90aW1lKSwgCiAgICAgICAgICAuZm5zID0gfiBzdHJfcGFkKC54LCB3aWR0aCA9IDQsIHBhZCA9IDApKSkgJT4lIAogIG11dGF0ZShkZXB0X2RhdGV0aW1lID0gYXMuRGF0ZShzdHJfYyh5ZWFyLngsIG1vbnRoLngsIGRheS54LCBkZXBfdGltZSwgc2VwID0gIi8iKSkpCgolPiUgCiAgbXV0YXRlKHltZChkZXB0X2RhdGV0aW1lLCBmb3JtYXQgPSApKQoKCiU+JSAKICBtdXRhdGUoZGVwYXJ0dXJlX3RpbWUgPSBtYWtlX2RhdGV0aW1lKHllYXIueCwgbW9udGgueCwgZGF5LngsIGRlcF90aW1lKSkKCiU+JSAKICBtdXRhdGUoYWNyb3NzKC5jb2xzID0gYyhkZXBfdGltZSwgc2NoZWRfZGVwX3RpbWUsIGFycl90aW1lLCBzY2hlZF9hcnJfdGltZSksIAogICAgICAgICAgLmZucyA9IH4gc3RyX3JlcGxhY2UoLngsICJeKC4uKSIsICJcXDE6IikpKSAlPiUgCiAgbXV0YXRlKGRlcGFydHVyZV90aW1lID0gbWFrZV9kYXRlKHllYXIueCwgbW9udGgueCwgZGF5LngsIGFzLmludGVnZXIoZGVwX3RpbWUpKSkKCgolPiUgCiAgbXV0YXRlKGFjcm9zcyguY29scyA9IGMoZGVwX3RpbWUsIHNjaGVkX2RlcF90aW1lLCBhcnJfdGltZSwgc2NoZWRfYXJyX3RpbWUpLCAKICAgICAgICAgIC5mbnMgPSB+IGhtczo6YXMuaG1zKC54KSkpCiAgCmhtcyhhJGRlcF90aW1lLCAiSE0iKQoKYQo/aG1zCmBgYAoKYGBge3J9CnRpbWU8LWMoIjAwOjAwOjAxIiwgIjAxOjAyOjAwIiwgIjA5OjMwOjAxIiwgIjE0OjE1OjI1IikKCnggPC0gY2hyb24odGltZXM9dGltZSkKCmNsYXNzKHgpCmBgYAoKYGBge3J9CmNvbG5hbWVzIDwtIGNvbG5hbWVzKGZsaWdodHNfY2xlYW5lZCkKY29sbmFtZXMKYGBgCgo=